In [2]:
import statsmodels.api as sm 
import pandas as pd
import statsmodels.formula.api as smf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
import numpy as np
In [3]:
df=pd.read_csv('/Users/maha/Downloads/wages_by_education.csv')
In [4]:
df.head()
Out[4]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15
3 2019 16.11 21.64 24.00 39.61 51.57 17.55 23.99 26.99 45.74 ... 13.30 17.18 19.17 30.31 40.42 14.50 17.71 19.69 30.18 42.30
4 2018 15.94 21.50 23.70 38.87 51.03 17.70 23.72 26.61 44.97 ... 12.48 16.81 18.57 30.85 39.64 13.47 17.28 19.29 29.47 39.35

5 rows × 61 columns

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             50 non-null     int64  
 1   less_than_hs                     50 non-null     float64
 2   high_school                      50 non-null     float64
 3   some_college                     50 non-null     float64
 4   bachelors_degree                 50 non-null     float64
 5   advanced_degree                  50 non-null     float64
 6   men_less_than_hs                 50 non-null     float64
 7   men_high_school                  50 non-null     float64
 8   men_some_college                 50 non-null     float64
 9   men_bachelors_degree             50 non-null     float64
 10  men_advanced_degree              50 non-null     float64
 11  women_less_than_hs               50 non-null     float64
 12  women_high_school                50 non-null     float64
 13  women_some_college               50 non-null     float64
 14  women_bachelors_degree           50 non-null     float64
 15  women_advanced_degree            50 non-null     float64
 16  white_less_than_hs               50 non-null     float64
 17  white_high_school                50 non-null     float64
 18  white_some_college               50 non-null     float64
 19  white_bachelors_degree           50 non-null     float64
 20  white_advanced_degree            50 non-null     float64
 21  black_less_than_hs               50 non-null     float64
 22  black_high_school                50 non-null     float64
 23  black_some_college               50 non-null     float64
 24  black_bachelors_degree           50 non-null     float64
 25  black_advanced_degree            50 non-null     float64
 26  hispanic_less_than_hs            50 non-null     float64
 27  hispanic_high_school             50 non-null     float64
 28  hispanic_some_college            50 non-null     float64
 29  hispanic_bachelors_degree        50 non-null     float64
 30  hispanic_advanced_degree         50 non-null     float64
 31  white_men_less_than_hs           50 non-null     float64
 32  white_men_high_school            50 non-null     float64
 33  white_men_some_college           50 non-null     float64
 34  white_men_bachelors_degree       50 non-null     float64
 35  white_men_advanced_degree        50 non-null     float64
 36  black_men_less_than_hs           50 non-null     float64
 37  black_men_high_school            50 non-null     float64
 38  black_men_some_college           50 non-null     float64
 39  black_men_bachelors_degree       50 non-null     float64
 40  black_men_advanced_degree        50 non-null     float64
 41  hispanic_men_less_than_hs        50 non-null     float64
 42  hispanic_men_high_school         50 non-null     float64
 43  hispanic_men_some_college        50 non-null     float64
 44  hispanic_men_bachelors_degree    50 non-null     float64
 45  hispanic_men_advanced_degree     50 non-null     float64
 46  white_women_less_than_hs         50 non-null     float64
 47  white_women_high_school          50 non-null     float64
 48  white_women_some_college         50 non-null     float64
 49  white_women_bachelors_degree     50 non-null     float64
 50  white_women_advanced_degree      50 non-null     float64
 51  black_women_less_than_hs         50 non-null     float64
 52  black_women_high_school          50 non-null     float64
 53  black_women_some_college         50 non-null     float64
 54  black_women_bachelors_degree     50 non-null     float64
 55  black_women_advanced_degree      50 non-null     float64
 56  hispanic_women_less_than_hs      50 non-null     float64
 57  hispanic_women_high_school       50 non-null     float64
 58  hispanic_women_some_college      50 non-null     float64
 59  hispanic_women_bachelors_degree  50 non-null     float64
 60  hispanic_women_advanced_degree   50 non-null     float64
dtypes: float64(60), int64(1)
memory usage: 24.0 KB
In [7]:
fig = px.line(df, x='year', y=['less_than_hs', 'high_school', 'some_college', 'bachelors_degree', 'advanced_degree'],
              labels={'value': 'Wage', 'variable': 'Education Level'},
              title='Wages by Education Level in the USA (1973-2022)')
In [8]:
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Wage',
    legend_title='Education Level',
    hovermode='x'
)

fig.show()

OLS Regression Model (Table #1)¶

In [5]:
df['women_total'] = df[['women_less_than_hs', 'women_high_school', 'women_some_college', 'women_bachelors_degree', 'women_advanced_degree']].sum(axis=1)
df['men_total'] = df[['men_less_than_hs', 'men_high_school', 'men_some_college', 'men_bachelors_degree', 'men_advanced_degree']].sum(axis=1)
In [6]:
sns.scatterplot(x='women_total', y='year', data=df, label='Women')
sns.scatterplot(x='men_total', y='year', data=df, label='Men')


plt.xlabel('Wage')
plt.ylabel('Year')
plt.title('Scatter Plot of Wages by Gender')
plt.legend()
plt.show()
In [7]:
model = smf.ols(formula = "year ~ women_total", data = df).fit()
model.summary()
Out[7]:
OLS Regression Results
Dep. Variable: year R-squared: 0.911
Model: OLS Adj. R-squared: 0.909
Method: Least Squares F-statistic: 493.4
Date: Wed, 06 Dec 2023 Prob (F-statistic): 6.66e-27
Time: 17:34:01 Log-Likelihood: -143.84
No. Observations: 50 AIC: 291.7
Df Residuals: 48 BIC: 295.5
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 1852.4782 6.558 282.474 0.000 1839.292 1865.664
women_total 1.2473 0.056 22.213 0.000 1.134 1.360
Omnibus: 2.036 Durbin-Watson: 0.242
Prob(Omnibus): 0.361 Jarque-Bera (JB): 1.706
Skew: -0.450 Prob(JB): 0.426
Kurtosis: 2.905 Cond. No. 1.24e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.24e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [8]:
model = smf.ols(formula = "year ~ men_total", data = df).fit()
model.summary()
Out[8]:
OLS Regression Results
Dep. Variable: year R-squared: 0.718
Model: OLS Adj. R-squared: 0.712
Method: Least Squares F-statistic: 122.1
Date: Wed, 06 Dec 2023 Prob (F-statistic): 8.70e-15
Time: 17:34:01 Log-Likelihood: -172.78
No. Observations: 50 AIC: 349.6
Df Residuals: 48 BIC: 353.4
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 1809.6309 17.037 106.219 0.000 1775.376 1843.886
men_total 1.1955 0.108 11.051 0.000 0.978 1.413
Omnibus: 2.704 Durbin-Watson: 0.175
Prob(Omnibus): 0.259 Jarque-Bera (JB): 2.486
Skew: -0.534 Prob(JB): 0.289
Kurtosis: 2.766 Cond. No. 2.43e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.43e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [9]:
sns.scatterplot(x='high_school', y='year', data=df, label='High School Diploma')
sns.scatterplot(x='advanced_degree', y='year', data=df, label='Advanced Degree')


plt.xlabel('Wage')
plt.ylabel('Year')
plt.title('Scatter Plot of Wages by Gender')
plt.legend()
plt.show()
In [10]:
model = smf.ols(formula = "year ~ high_school", data = df).fit()
model.summary()
Out[10]:
OLS Regression Results
Dep. Variable: year R-squared: 0.029
Model: OLS Adj. R-squared: 0.009
Method: Least Squares F-statistic: 1.435
Date: Wed, 06 Dec 2023 Prob (F-statistic): 0.237
Time: 17:34:02 Log-Likelihood: -203.68
No. Observations: 50 AIC: 411.4
Df Residuals: 48 BIC: 415.2
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 1927.6802 58.311 33.059 0.000 1810.438 2044.922
high_school 3.3444 2.791 1.198 0.237 -2.268 8.957
Omnibus: 5.666 Durbin-Watson: 0.010
Prob(Omnibus): 0.059 Jarque-Bera (JB): 2.882
Skew: -0.339 Prob(JB): 0.237
Kurtosis: 2.039 Cond. No. 595.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [11]:
model = smf.ols(formula = "year ~ advanced_degree", data = df).fit()
model.summary()
Out[11]:
OLS Regression Results
Dep. Variable: year R-squared: 0.934
Model: OLS Adj. R-squared: 0.933
Method: Least Squares F-statistic: 679.2
Date: Wed, 06 Dec 2023 Prob (F-statistic): 5.54e-30
Time: 17:34:02 Log-Likelihood: -136.47
No. Observations: 50 AIC: 276.9
Df Residuals: 48 BIC: 280.8
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 1880.9385 4.505 417.564 0.000 1871.882 1889.996
advanced_degree 2.6552 0.102 26.061 0.000 2.450 2.860
Omnibus: 1.376 Durbin-Watson: 0.347
Prob(Omnibus): 0.503 Jarque-Bera (JB): 1.048
Skew: -0.354 Prob(JB): 0.592
Kurtosis: 2.962 Cond. No. 372.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Figure #1¶

In [11]:
selected_year = [2020,2021,2022]
In [10]:
filtered_df = df[df['year'].isin(selected_year)]
filtered_df
Out[10]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15

3 rows × 61 columns

In [12]:
filtered_df.plot(x='year', y=['less_than_hs','high_school','some_college','bachelors_degree','advanced_degree'], kind='bar', legend=True)
plt.title('Bar Chart Example')
plt.xlabel('Years')
plt.ylabel('Values')
plt.show()

Figure #2¶

In [13]:
from pandasql import sqldf
In [14]:
df = pd.read_csv('/Users/maha/Downloads/wages_by_education.csv')
In [15]:
pysqldf = sqldf("SELECT * FROM df", globals())
In [16]:
sqldf("SELECT * FROM df")
Out[16]:
year less_than_hs high_school some_college bachelors_degree advanced_degree men_less_than_hs men_high_school men_some_college men_bachelors_degree ... black_women_less_than_hs black_women_high_school black_women_some_college black_women_bachelors_degree black_women_advanced_degree hispanic_women_less_than_hs hispanic_women_high_school hispanic_women_some_college hispanic_women_bachelors_degree hispanic_women_advanced_degree
0 2022 16.52 21.94 24.81 41.60 53.22 17.99 24.08 27.96 49.01 ... 13.89 17.83 20.36 29.94 39.41 14.74 18.18 20.64 31.13 40.64
1 2021 16.74 22.28 24.92 41.32 53.45 18.34 24.36 27.96 47.83 ... 13.73 18.30 20.18 29.35 40.07 14.97 18.34 21.14 31.25 42.47
2 2020 17.02 22.70 25.44 41.65 53.74 18.76 25.09 28.55 48.15 ... 13.66 17.93 20.31 31.38 42.44 14.58 18.50 20.69 31.55 44.15
3 2019 16.11 21.64 24.00 39.61 51.57 17.55 23.99 26.99 45.74 ... 13.30 17.18 19.17 30.31 40.42 14.50 17.71 19.69 30.18 42.30
4 2018 15.94 21.50 23.70 38.87 51.03 17.70 23.72 26.61 44.97 ... 12.48 16.81 18.57 30.85 39.64 13.47 17.28 19.29 29.47 39.35
5 2017 15.92 21.26 23.31 38.65 49.40 17.63 23.47 25.91 44.50 ... 12.99 16.33 18.76 29.19 38.26 13.36 17.02 19.60 29.69 38.43
6 2016 15.45 21.00 23.35 38.73 49.64 16.95 23.01 25.98 44.98 ... 12.42 16.71 18.88 29.56 38.55 13.40 17.11 19.26 29.41 37.77
7 2015 15.25 20.81 23.17 37.96 48.42 16.63 22.93 25.99 43.47 ... 12.79 16.21 18.50 30.42 37.37 13.42 16.99 18.66 29.71 36.37
8 2014 14.66 20.34 22.48 36.47 47.27 15.95 22.40 24.96 41.14 ... 12.47 16.09 18.39 30.20 37.88 12.61 16.55 18.22 28.53 37.16
9 2013 14.62 20.31 22.50 37.00 48.54 15.86 22.28 25.07 42.34 ... 12.41 16.60 18.32 29.57 38.40 12.80 16.23 18.70 28.20 36.92
10 2012 14.80 20.50 22.56 36.76 48.47 16.15 22.65 25.05 42.25 ... 12.20 16.52 18.66 29.43 37.46 12.61 16.41 18.78 27.58 38.91
11 2011 14.96 20.67 22.90 36.44 47.56 16.16 22.81 25.34 41.41 ... 13.40 16.56 18.80 30.49 38.01 13.06 16.56 19.44 28.19 39.59
12 2010 15.04 21.06 23.56 37.42 48.80 16.32 23.28 26.24 42.71 ... 13.63 17.33 19.61 30.53 37.74 12.89 16.85 19.65 28.39 41.27
13 2009 15.41 21.42 23.82 37.30 49.12 16.91 23.74 26.75 43.04 ... 13.23 17.26 19.88 29.59 39.37 13.24 17.43 19.63 29.44 41.34
14 2008 15.08 20.93 23.52 37.04 47.55 16.56 23.30 26.32 42.34 ... 12.78 16.69 19.55 30.50 38.24 13.02 16.70 19.86 28.68 37.34
15 2007 15.19 21.04 23.84 37.23 47.31 16.60 23.39 26.66 42.55 ... 13.08 17.22 20.07 29.77 38.75 12.76 16.91 19.94 29.81 40.13
16 2006 14.90 21.17 23.69 36.99 47.23 16.31 23.55 26.39 41.97 ... 13.25 17.02 19.94 30.53 38.06 12.68 16.90 19.62 28.80 38.93
17 2005 14.96 21.07 23.76 36.89 47.09 16.46 23.34 26.49 41.96 ... 12.90 17.07 19.69 29.25 38.33 12.64 16.96 19.73 28.61 38.19
18 2004 15.13 21.33 24.11 36.78 47.48 16.66 23.74 27.03 41.74 ... 12.81 17.51 20.01 30.28 38.36 12.69 17.24 20.08 29.68 37.50
19 2003 15.27 21.50 24.19 37.14 46.79 16.79 23.89 27.04 42.18 ... 13.13 17.59 20.45 31.02 37.78 13.04 17.38 20.13 28.66 35.91
20 2002 15.18 21.39 24.25 37.26 47.36 16.70 23.92 27.10 42.36 ... 12.84 17.41 20.15 30.11 38.80 12.80 16.66 20.03 28.39 37.85
21 2001 14.88 21.10 24.05 37.07 46.23 16.23 23.67 27.12 42.14 ... 13.04 16.97 19.70 30.11 37.22 12.83 16.88 19.71 28.71 36.79
22 2000 14.67 20.80 23.68 36.44 45.99 16.13 23.48 26.78 41.41 ... 12.95 16.81 19.59 30.18 36.87 12.29 16.53 19.38 28.28 35.66
23 1999 14.61 20.67 23.36 35.76 46.02 16.16 23.31 26.34 40.71 ... 12.72 16.42 19.21 29.54 37.04 12.16 16.38 19.03 28.09 35.95
24 1998 14.57 20.39 23.02 34.97 44.49 16.04 22.99 25.86 39.63 ... 12.57 16.25 19.29 28.68 36.92 12.10 16.27 19.21 27.55 34.36
25 1997 14.07 19.95 22.54 33.44 43.68 15.44 22.58 25.33 37.84 ... 12.25 15.83 18.78 26.60 33.90 11.83 15.66 18.37 27.23 34.91
26 1996 13.95 19.62 22.11 32.50 43.35 15.45 22.11 24.79 36.80 ... 11.77 15.70 17.92 26.03 35.73 11.77 15.53 18.28 26.75 38.63
27 1995 13.95 19.68 22.04 32.86 43.27 15.39 22.20 24.78 37.07 ... 12.07 15.53 18.59 27.99 37.90 11.35 15.91 18.17 26.66 36.22
28 1994 14.33 19.88 22.26 32.72 43.46 15.72 22.39 24.89 36.89 ... 12.45 15.79 18.90 27.01 37.24 12.07 16.47 18.86 26.70 36.96
29 1993 14.60 19.65 22.27 32.16 41.45 16.09 22.18 24.84 36.37 ... 12.73 15.84 18.70 27.45 35.67 11.94 16.25 18.79 25.57 36.49
30 1992 14.87 19.65 22.26 32.16 40.94 16.42 22.32 24.95 36.38 ... 13.21 15.73 18.98 27.01 34.10 12.18 15.79 18.89 26.66 34.62
31 1991 14.99 19.77 22.49 31.87 40.71 16.66 22.49 25.46 36.36 ... 12.61 15.84 18.91 26.81 32.58 12.19 16.11 18.51 25.40 33.75
32 1990 15.05 19.74 22.58 32.21 39.95 16.90 22.69 25.75 36.77 ... 12.20 15.63 18.77 25.51 32.97 12.07 15.93 18.33 26.63 32.33
33 1989 15.46 20.07 22.61 31.89 39.80 17.63 23.31 25.78 36.60 ... 12.14 15.69 19.00 25.97 34.39 12.13 15.95 19.17 25.12 32.88
34 1988 15.79 20.66 23.03 33.41 41.12 18.02 24.18 26.71 39.34 ... 12.32 16.17 18.67 26.04 35.80 12.27 15.77 19.43 24.59 31.65
35 1987 15.76 20.60 23.17 33.19 40.55 18.05 24.17 27.02 39.07 ... 12.60 16.03 18.76 25.21 33.45 12.31 15.97 18.92 25.01 32.33
36 1986 16.10 20.61 23.12 32.67 39.88 18.44 24.24 27.04 38.52 ... 12.78 16.09 18.48 24.60 33.04 12.18 16.35 18.68 24.92 28.90
37 1985 16.08 20.40 22.60 31.50 38.31 18.46 24.14 26.25 37.00 ... 12.45 15.87 18.30 25.15 31.28 12.40 15.89 18.38 24.12 29.40
38 1984 16.21 20.30 22.37 31.07 37.45 18.66 24.06 25.97 36.63 ... 12.39 15.65 18.04 23.66 31.81 12.39 16.25 17.79 22.23 30.82
39 1983 16.43 20.43 22.28 30.73 36.78 18.89 24.33 26.04 36.21 ... 12.92 15.94 17.71 23.73 30.64 12.35 16.02 18.94 23.17 26.31
40 1982 16.73 20.65 22.43 30.55 36.08 19.34 24.64 26.22 35.80 ... 12.81 15.95 17.88 23.54 30.65 12.67 16.48 17.68 23.98 30.21
41 1981 17.07 20.68 22.47 30.17 35.32 19.64 24.84 26.24 35.41 ... 12.88 15.86 18.14 23.82 31.05 13.02 16.13 17.49 21.29 31.21
42 1980 17.42 20.90 22.73 30.04 35.53 20.08 25.18 26.63 35.16 ... 12.98 16.09 18.28 23.99 31.58 12.87 16.16 17.86 22.60 28.23
43 1979 17.95 21.62 23.20 30.57 36.19 20.74 26.04 27.18 35.72 ... 13.33 16.30 18.54 23.84 29.34 13.20 16.27 18.02 21.94 30.87
44 1978 17.29 21.52 23.25 30.90 37.56 20.10 25.97 27.14 35.92 ... 12.67 17.00 18.24 25.47 34.08 13.08 15.72 17.56 21.26 28.97
45 1977 17.59 21.50 22.97 31.07 37.36 20.43 25.97 26.70 36.09 ... 12.80 16.30 18.87 25.83 37.10 12.66 17.22 17.12 21.49 30.31
46 1976 17.52 21.76 23.49 31.46 37.50 20.36 26.14 27.10 36.42 ... 12.75 16.40 19.11 26.10 34.92 13.79 17.29 17.40 20.29 20.93
47 1975 17.30 21.55 23.30 31.45 38.41 20.00 26.02 26.93 36.21 ... 12.48 15.88 18.11 27.93 33.03 12.87 16.04 17.04 18.85 23.63
48 1974 17.68 21.60 23.32 31.69 38.37 20.63 26.15 26.79 36.62 ... 12.31 15.55 18.19 27.80 33.86 13.24 16.29 18.17 21.70 26.74
49 1973 18.06 22.22 24.08 32.80 38.16 21.18 26.90 27.67 37.69 ... 12.02 15.79 18.11 28.35 41.81 12.92 16.46 20.22 22.23 25.10

50 rows × 61 columns

In [17]:
comparison_result = (sqldf("SELECT year, AVG(men_high_school) AS men_high_school,AVG(women_high_school) AS women_high_school,AVG(men_advanced_degree) AS men_advanced_degree, AVG(women_advanced_degree) AS women_advanced_degree FROM df GROUP BY year"))
In [18]:
print("Overview of Data Manipulation:")

plt.figure(figsize=(12, 8))
plt.plot(comparison_result['year'], comparison_result['men_high_school'], label='Men - High School')
plt.plot(comparison_result['year'], comparison_result['men_advanced_degree'], label='Men - Advanced Degree')
plt.plot(comparison_result['year'], comparison_result['women_high_school'], label='Women - High School')
plt.plot(comparison_result['year'], comparison_result['women_advanced_degree'], label='Women - Advanced Degree')

plt.title('Comparison of Average Wages for High School-educated Men and Women Over the Years')
plt.xlabel('Year')
plt.ylabel('Average Wages')
plt.legend()
plt.show()
Overview of Data Manipulation:
In [ ]: